The following is an example random sampling from the dataframe:
options(warn=-1)
resultsSet <- read.csv("twestbrook-hgt1-results.txt", sep = "\t")
exampleSet <- resultsSet[sample(nrow(resultsSet), 5),]
cat(sprintf("Example set: %d out of %d rows\n", 5, nrow(resultsSet)))
tail(exampleSet)
Effectiveness of phylogenetic profiling based on good profile hits minus bad hits out of total possible profile hits. Left doesn't remove ambiguous hits, while right does.
Given this, calculate the effectiveness for both host and symbiont, then plot to see correlation:
resultsSet$ResultEffectiveHost <- (resultsSet$ResultGoodHost - resultsSet$ResultBadHost) / resultsSet$ActualOriginHost
resultsSet$ResultEffectiveSym <- (resultsSet$ResultGoodSym - resultsSet$ResultBadSym) / resultsSet$ActualOriginSym
resultsSet$ResultEffectiveHostA <- (resultsSet$ResultGoodHost - resultsSet$ResultBadHost - resultsSet$ResultAmbigHost) / resultsSet$ActualOriginHost
resultsSet$ResultEffectiveSymA <- (resultsSet$ResultGoodSym - resultsSet$ResultBadSym - resultsSet$ResultAmbigSym) / resultsSet$ActualOriginSym
# Normalize from -1:1 to 0:1
resultsSet$ResultEffectiveHost <- (resultsSet$ResultEffectiveHost + 1) / 2
resultsSet$ResultEffectiveSym <- (resultsSet$ResultEffectiveSym + 1) / 2
resultsSet$ResultEffectiveHostA <- (resultsSet$ResultEffectiveHostA + 1) / 2
resultsSet$ResultEffectiveSymA <- (resultsSet$ResultEffectiveSymA + 1) / 2
library(gridExtra)
library(ggplot2)
options(repr.plot.width = 15, repr.plot.height = 15)
p1 <- ggplot(resultsSet, aes(ResultEffectiveHost, ResultEffectiveSym, color=ParamSimMutation)) + geom_point(alpha = 1/20) + scale_colour_gradient(low="black", high="green")
p2 <- ggplot(resultsSet, aes(ResultEffectiveHostA, ResultEffectiveSymA, color=ParamSimMutation)) + geom_point(alpha = 1/20) + scale_colour_gradient(low="black", high="green")
p3 <- ggplot(resultsSet, aes(ResultEffectiveHost, ResultEffectiveSym, color=ParamSimHGT)) + geom_point(alpha = 1/20) + scale_colour_gradient(low="black", high="green")
p4 <- ggplot(resultsSet, aes(ResultEffectiveHostA, ResultEffectiveSymA, color=ParamSimHGT)) + geom_point(alpha = 1/20) + scale_colour_gradient(low="black", high="green")
p5 <- ggplot(resultsSet, aes(ResultEffectiveHost, ResultEffectiveSym, color=ParamSimDrop)) + geom_point(alpha = 1/20) + scale_colour_gradient(low="black", high="green")
p6 <- ggplot(resultsSet, aes(ResultEffectiveHostA, ResultEffectiveSymA, color=ParamSimDrop)) + geom_point(alpha = 1/20) + scale_colour_gradient(low="black", high="green")
grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 2, nrow = 3)
p1 <- ggplot(resultsSet, aes(ResultEffectiveHost, ResultEffectiveSym, color=ParamTestDistance)) + geom_point(alpha = 1/20) + scale_colour_gradient(low="black", high="green")
p2 <- ggplot(resultsSet, aes(ResultEffectiveHostA, ResultEffectiveSymA, color=ParamTestDistance)) + geom_point(alpha = 1/20) + scale_colour_gradient(low="black", high="green")
p3 <- ggplot(resultsSet, aes(ResultEffectiveHost, ResultEffectiveSym, color=ParamTestTaxa)) + geom_point(alpha = 1/20) + scale_colour_gradient(low="black", high="green")
p4 <- ggplot(resultsSet, aes(ResultEffectiveHostA, ResultEffectiveSymA, color=ParamTestTaxa)) + geom_point(alpha = 1/20) + scale_colour_gradient(low="black", high="green")
p5 <- ggplot(resultsSet, aes(ResultEffectiveHost, ResultEffectiveSym, color=ParamTestProtein)) + geom_point(alpha = 1/20) + scale_colour_gradient(low="black", high="green")
p6 <- ggplot(resultsSet, aes(ResultEffectiveHostA, ResultEffectiveSymA, color=ParamTestProtein)) + geom_point(alpha = 1/20) + scale_colour_gradient(low="black", high="green")
grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 2, nrow = 3)
Look at the correlation across specific simulation combinations.
First row: All three rates at 0.1, 0.3, and 0.5 percent
Each subsequent row - Two rates at 0.1, 0.3, and 0.5 with the third rate fixed at 0.1. (In order: Mutation, HGT, then Drop)
options(repr.plot.width = 10, repr.plot.height = 3)
p1 <- ggplot(resultsSet[resultsSet$ParamSimMutation == 0.1 & resultsSet$ParamSimHGT == 0.1 & resultsSet$ParamSimDrop == 0.1,], aes(ResultEffectiveHostA, ResultEffectiveSymA)) + geom_point(alpha = 1/10)
p2 <- ggplot(resultsSet[resultsSet$ParamSimMutation == 0.3 & resultsSet$ParamSimHGT == 0.3 & resultsSet$ParamSimDrop == 0.3,], aes(ResultEffectiveHostA, ResultEffectiveSymA)) + geom_point(alpha = 1/10)
p3 <- ggplot(resultsSet[resultsSet$ParamSimMutation == 0.5 & resultsSet$ParamSimHGT == 0.5 & resultsSet$ParamSimDrop == 0.5,], aes(ResultEffectiveHostA, ResultEffectiveSymA)) + geom_point(alpha = 1/10)
grid.arrange(p1, p2, p3, ncol = 3, nrow = 1)#, widths=c(1,1,1))
Mutation rate fixed at 0.1
p1 <- ggplot(resultsSet[resultsSet$ParamSimMutation == 0.1 & resultsSet$ParamSimHGT == 0.1 & resultsSet$ParamSimDrop == 0.1,], aes(ResultEffectiveHostA, ResultEffectiveSymA)) + geom_point(alpha = 1/10)
p2 <- ggplot(resultsSet[resultsSet$ParamSimMutation == 0.1 & resultsSet$ParamSimHGT == 0.3 & resultsSet$ParamSimDrop == 0.3,], aes(ResultEffectiveHostA, ResultEffectiveSymA)) + geom_point(alpha = 1/10)
p3 <- ggplot(resultsSet[resultsSet$ParamSimMutation == 0.1 & resultsSet$ParamSimHGT == 0.5 & resultsSet$ParamSimDrop == 0.5,], aes(ResultEffectiveHostA, ResultEffectiveSymA)) + geom_point(alpha = 1/10)
grid.arrange(p1, p2, p3, ncol = 3, nrow = 1)
HGT rate fixed at 0.1
p1 <- ggplot(resultsSet[resultsSet$ParamSimMutation == 0.1 & resultsSet$ParamSimHGT == 0.1 & resultsSet$ParamSimDrop == 0.1,], aes(ResultEffectiveHostA, ResultEffectiveSymA)) + geom_point(alpha = 1/10)
p2 <- ggplot(resultsSet[resultsSet$ParamSimMutation == 0.3 & resultsSet$ParamSimHGT == 0.1 & resultsSet$ParamSimDrop == 0.3,], aes(ResultEffectiveHostA, ResultEffectiveSymA)) + geom_point(alpha = 1/10)
p3 <- ggplot(resultsSet[resultsSet$ParamSimMutation == 0.5 & resultsSet$ParamSimHGT == 0.1 & resultsSet$ParamSimDrop == 0.5,], aes(ResultEffectiveHostA, ResultEffectiveSymA)) + geom_point(alpha = 1/10)
grid.arrange(p1, p2, p3, ncol = 3, nrow = 1)
Drop rate fixed at 0.1
p1 <- ggplot(resultsSet[resultsSet$ParamSimMutation == 0.1 & resultsSet$ParamSimHGT == 0.1 & resultsSet$ParamSimDrop == 0.1,], aes(ResultEffectiveHostA, ResultEffectiveSymA)) + geom_point(alpha = 1/10)
p2 <- ggplot(resultsSet[resultsSet$ParamSimMutation == 0.3 & resultsSet$ParamSimHGT == 0.3 & resultsSet$ParamSimDrop == 0.1,], aes(ResultEffectiveHostA, ResultEffectiveSymA)) + geom_point(alpha = 1/10)
p3 <- ggplot(resultsSet[resultsSet$ParamSimMutation == 0.5 & resultsSet$ParamSimHGT == 0.5 & resultsSet$ParamSimDrop == 0.1,], aes(ResultEffectiveHostA, ResultEffectiveSymA)) + geom_point(alpha = 1/10)
grid.arrange(p1, p2, p3, ncol = 3, nrow = 1)
Now compensate for ambiguous hits as well, as view effect
Accounting for ambiguous cases seems to add a senario (given some simulation/testing parameters) that causes a negative correlation. Investigate this
Calculate effective HGT by taking good HGT hits minux bad HGT hits out of all possible HGT events. Then plot in relation to both the host profile and the symbiont profile
resultsSet$ResultEffectiveHGT <- (resultsSet$ResultHGTGood - resultsSet$ResultHGTBad - resultsSet$ResultHGTAmbig) / resultsSet$ActualHGT
resultsSet$ResultEffectiveHGT <- (resultsSet$ResultEffectiveHGT + 1) / 2
options(repr.plot.width = 15, repr.plot.height = 15)
p1 <- ggplot(resultsSet, aes(ResultEffectiveHostA, ResultEffectiveHGT, color=ParamSimMutation)) + geom_point(alpha = 1/10) + scale_colour_gradient(low="black", high="green")
p2 <- ggplot(resultsSet, aes(ResultEffectiveSymA, ResultEffectiveHGT, color=ParamSimMutation)) + geom_point(alpha = 1/10) + scale_colour_gradient(low="black", high="green")
p3 <- ggplot(resultsSet, aes(ResultEffectiveHostA, ResultEffectiveHGT, color=ParamSimHGT)) + geom_point(alpha = 1/10) + scale_colour_gradient(low="black", high="green")
p4 <- ggplot(resultsSet, aes(ResultEffectiveSymA, ResultEffectiveHGT, color=ParamSimHGT)) + geom_point(alpha = 1/10) + scale_colour_gradient(low="black", high="green")
p5 <- ggplot(resultsSet, aes(ResultEffectiveHostA, ResultEffectiveHGT, color=ParamSimDrop)) + geom_point(alpha = 1/10) + scale_colour_gradient(low="black", high="green")
p6 <- ggplot(resultsSet, aes(ResultEffectiveSymA, ResultEffectiveHGT, color=ParamSimDrop)) + geom_point(alpha = 1/10) + scale_colour_gradient(low="black", high="green")
grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 2, nrow = 3)
p1 <- ggplot(resultsSet, aes(ResultEffectiveHostA, ResultEffectiveHGT, color=ParamTestDistance)) + geom_point(alpha = 1/10) + scale_colour_gradient(low="black", high="green")
p2 <- ggplot(resultsSet, aes(ResultEffectiveSymA, ResultEffectiveHGT, color=ParamTestDistance)) + geom_point(alpha = 1/10) + scale_colour_gradient(low="black", high="green")
p3 <- ggplot(resultsSet, aes(ResultEffectiveHostA, ResultEffectiveHGT, color=ParamTestTaxa)) + geom_point(alpha = 1/10) + scale_colour_gradient(low="black", high="green")
p4 <- ggplot(resultsSet, aes(ResultEffectiveSymA, ResultEffectiveHGT, color=ParamTestTaxa)) + geom_point(alpha = 1/10) + scale_colour_gradient(low="black", high="green")
p5 <- ggplot(resultsSet, aes(ResultEffectiveHostA, ResultEffectiveHGT, color=ParamTestProtein)) + geom_point(alpha = 1/10) + scale_colour_gradient(low="black", high="green")
p6 <- ggplot(resultsSet, aes(ResultEffectiveSymA, ResultEffectiveHGT, color=ParamTestProtein)) + geom_point(alpha = 1/10) + scale_colour_gradient(low="black", high="green")
grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 2, nrow = 3)
Condense effectiveness into a single value by averaging these two values. Isolate the parameters and this total effectiveness into a new dataframe. Then show a correlation matrix, and pairwise comparisons for simulation and testing parameters against total effectiveness.
parameterSet <- data.frame(resultsSet$ParamSimMutation,
resultsSet$ParamSimHGT,
resultsSet$ParamSimDrop,
resultsSet$ParamTestDistance,
resultsSet$ParamTestTaxa,
resultsSet$ParamTestProtein,
resultsSet$ResultEffectiveHostA)
cor(parameterSet, use="complete")
parameterSet <- data.frame(resultsSet$ParamSimMutation,
resultsSet$ParamSimHGT,
resultsSet$ParamSimDrop,
resultsSet$ParamTestDistance,
resultsSet$ParamTestTaxa,
resultsSet$ParamTestProtein,
resultsSet$ResultEffectiveSymA)
cor(parameterSet, use="complete")
resultsSet$ResultEffectiveCombined <- (resultsSet$ResultEffectiveHostA + resultsSet$ResultEffectiveSymA) / 2
parameterSet <- data.frame(resultsSet$ParamSimMutation,
resultsSet$ParamSimHGT,
resultsSet$ParamSimDrop,
resultsSet$ParamTestDistance,
resultsSet$ParamTestTaxa,
resultsSet$ParamTestProtein,
resultsSet$ResultEffectiveCombined)
cor(parameterSet, use="complete")
Testing to see distributions of total effectiveness by parameter type
p1 <- ggplot(resultsSet, aes(ParamSimMutation, ResultEffectiveCombined, group=cut_width(ParamSimMutation, 0.10))) + geom_boxplot(outlier.color=alpha("black", 1/100))
p2 <- ggplot(resultsSet, aes(ParamSimHGT, ResultEffectiveCombined, group=cut_width(ParamSimHGT, 0.10))) + geom_boxplot(outlier.color=alpha("black", 1/100))
p3 <- ggplot(resultsSet, aes(ParamSimDrop, ResultEffectiveCombined, group=cut_width(ParamSimDrop, 0.10))) + geom_boxplot(outlier.color=alpha("black", 1/100))
p4 <- ggplot(resultsSet, aes(ParamTestDistance, ResultEffectiveCombined, group=cut_width(ParamTestDistance, 0.10))) + geom_boxplot(outlier.color=alpha("black", 1/100))
p5 <- ggplot(resultsSet, aes(ParamTestTaxa, ResultEffectiveCombined, group=cut_width(ParamTestTaxa, 0.10))) + geom_boxplot(outlier.color=alpha("black", 1/100))
p6 <- ggplot(resultsSet, aes(ParamTestProtein, ResultEffectiveCombined, group=cut_width(ParamTestProtein, 0.10))) + geom_boxplot(outlier.color=alpha("black", 1/100))
options(repr.plot.width = 10, repr.plot.height = 6)
grid.arrange(p1, p2, p3, p4, p5, p6, nrow=2, ncol=3)
chartList = sapply(c(0.2, 0.4, 0.6, 0.8), function(activeTaxa) {
rowList = lapply(c(0.2, 0.4, 0.6, 0.8), function (activeProtein) {
resultsSubset <- resultsSet[resultsSet$ParamTestTaxa == activeTaxa & resultsSet$ParamTestProtein == activeProtein & resultsSet$ParamSimDrop == 0.1 & resultsSet$ParamTestDistance == 0.7,]
effectiveColors <- rgb(resultsSubset$ResultEffectiveHostA, ifelse(is.nan(resultsSubset$ResultEffectiveSymA),0,resultsSubset$ResultEffectiveSymA), 0)
#effectiveColors <- hsv(resultsSubset$ResultEffectiveHostA, ifelse(is.nan(resultsSubset$ResultEffectiveSymA),0,resultsSubset$ResultEffectiveSymA) , 0.5 )
hgtColors <- rgb(0, 0, ifelse(is.nan(resultsSubset$ResultHGTAmbig/resultsSubset$ActualHGT), 0, resultsSubset$ResultHGTAmbig/resultsSubset$ActualHGT))
activePlot <- ggplot(resultsSubset, aes(x=ParamSimMutation, y=ParamSimHGT)) +
geom_tile(color="white", fill=effectiveColors) +
geom_point(size=resultsSubset$ResultEffectiveHGT * 4, shape=16, color="blue", alpha=0.5) +
theme(axis.title=element_text(size=8))
if (activeProtein != 0.2) activePlot <- activePlot + theme(axis.title.y=element_blank(), axis.text.y=element_blank(), axis.ticks.y=element_blank(),)
if (activeTaxa != 0.8) activePlot <- activePlot + theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank(),)
activePlot
})
})
options(repr.plot.width = 10, repr.plot.height = 6)
do.call("grid.arrange", c(chartList, nrow=4, ncol=4, left = "Testing: Taxa", bottom = "Testing: Protein"))
library(reshape2)
resultsSubset1 <- resultsSet[resultsSet$ResultEffectiveHostA > 0.80 & resultsSet$ResultEffectiveSymA > 0.80 & resultsSet$ResultEffectiveHGT > 0.80,]
focusSubset <- data.frame(resultsSubset1$ParamTestDistance, resultsSubset1$ParamTestTaxa, resultsSubset1$ParamTestProtein)
options(repr.plot.width = 4, repr.plot.height = 4)
ggplot(melt(focusSubset), aes(x = variable, y = value)) + geom_boxplot() +
scale_x_discrete(labels=c("Distance","Taxa","Protein")) +
scale_y_continuous(breaks = pretty(resultsSubset1$ParamTestDistance, n = 10))
resultsBaseSubset <- resultsSet[resultsSet$ParamTestDistance == 0.6 & resultsSet$ParamSimMutation <= 0.2 & resultsSet$ParamSimHGT > 0 & resultsSet$ParamSimHGT <= 0.3,]
chartList = lapply(c(0.4, 0.5, 0.6), function(activeTaxa) {
resultsSubset1 <- resultsBaseSubset[resultsSet$ParamTestTaxa == activeTaxa & resultsSet$ParamTestProtein == 0.4,]
resultsSubset2 <- resultsBaseSubset[resultsSet$ParamTestTaxa == activeTaxa & resultsSet$ParamTestProtein == 0.5,]
resultsSubset3 <- resultsBaseSubset[resultsSet$ParamTestTaxa == activeTaxa & resultsSet$ParamTestProtein == 0.6,]
activePlot <- ggplot() +
geom_point(data=resultsSubset1, aes(ResultEffectiveHostA, ResultEffectiveHGT, color="0.4"), alpha=1/3) +
geom_point(data=resultsSubset2, aes(ResultEffectiveHostA, ResultEffectiveHGT, color="0.5"), alpha=1/3) +
geom_point(data=resultsSubset3, aes(ResultEffectiveHostA, ResultEffectiveHGT, color="0.6"), alpha=1/3) +
stat_smooth(data=resultsSubset1, aes(ResultEffectiveHostA, ResultEffectiveHGT), method = "loess", formula = y ~ x, size = 1, col = "red") +
stat_smooth(data=resultsSubset2, aes(ResultEffectiveHostA, ResultEffectiveHGT), method = "loess", formula = y ~ x, size = 1, col = "green") +
stat_smooth(data=resultsSubset3, aes(ResultEffectiveHostA, ResultEffectiveHGT), method = "loess", formula = y ~ x, size = 1, col = "blue") +
coord_cartesian(xlim = c(0.5, 1), ylim = c(0, 1)) +
scale_color_manual("Protein", values=c("red", "green", "blue"))
if (activeTaxa != 0.4) activePlot <- activePlot + theme(axis.title.y=element_blank(), axis.text.y=element_blank(), axis.ticks.y=element_blank(),)
if (activeTaxa != 0.6) activePlot <- activePlot + theme(legend.position="none")
activePlot
})
options(repr.plot.width = 15, repr.plot.height = 5)
do.call("grid.arrange", c(chartList, nrow=1, ncol=3, bottom = "Testing: Taxa"))